package cn.haut.online_read.crawler;

import lombok.extern.slf4j.Slf4j;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} that extracts a chapter's name and its paragraph texts
 * from a downloaded chapter page.
 *
 * <p>The extracted values are stored on the {@link Page} under the result fields
 * {@code "paragraphs"} (a {@code List<String>}) and {@code "chapterName"} (a {@code String}).
 *
 * @author xing
 * @date 2021-04-21 13:19
 */
@Slf4j
public class BookChapterContentCrawler implements PageProcessor {

    /** XPath selecting the text of every {@code <p>} directly under the {@code inner-text} div. */
    private static final String PARAGRAPH_XPATH = "//div[@class='inner-text']/p/text()";

    /** XPath selecting the text of the third {@code <li>} of the {@code page-route} list. */
    private static final String CHAPTER_NAME_XPATH = "//ul[@class='page-route']/li[3]/text()";

    /** Result-field key under which the paragraph list is stored. */
    private static final String PARAGRAPHS_FIELD = "paragraphs";

    /** Result-field key under which the chapter name is stored. */
    private static final String CHAPTER_NAME_FIELD = "chapterName";

    /**
     * Crawl configuration returned by {@link #getSite()}; created once and never reassigned.
     *
     * <p>NOTE(review): WebMagic interprets the sleep time in milliseconds, so this is a 1 ms
     * pause between requests — confirm that 1000 (one second) was not intended.
     */
    private final Site site = Site.me().setSleepTime(1);

    /**
     * Extracts the chapter content from the page and stores it as result fields.
     *
     * <p>Both fields are always written. When the selectors match nothing, the paragraph list
     * is presumably empty and the chapter name presumably {@code null} (per WebMagic's
     * {@code Selectable} contract — verify before relying on it downstream).
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        List<String> paragraphs = page.getHtml().xpath(PARAGRAPH_XPATH).all();
        String chapterName = page.getHtml().xpath(CHAPTER_NAME_XPATH).get();

        page.putField(PARAGRAPHS_FIELD, paragraphs);
        page.putField(CHAPTER_NAME_FIELD, chapterName);
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the {@link Site} instance held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
